This script assembles R object from data downloaded from NCBI/Gene Expression Omnibus Database (GSE112679). The R objects will be assembled into an R dat package for easy access. The objects are constructed to parallel the analyses of these data are reported in Cai et al. (2019) [1]).
GSE_ID <- 'GSE112679'
# extData
################################
if(file.exists(file.path('../../extData')))
EXT_DIR <- file.path('../../extData') else stop("Speficy EXT_DIR")
# GSE_Data
###################################
if(file.exists(file.path(EXT_DIR, GSE_ID)))
GSE_DATA_DIR <- file.path(EXT_DIR,GSE_ID) else stop("Speficy GSE_DATA_DIR")
# SampleDesc
###################################
if(file.exists(file.path(GSE_DATA_DIR, paste0(GSE_ID,'_series_matrix.txt'))))
SAMPLE_DESC_FILE <- file.path(GSE_DATA_DIR, paste0(GSE_ID,'_series_matrix.txt')) else
stop("Speficy SAMPLE_DESC_FILE")
# Define utility function
readSeqData.f <- function(seqFile, Verbose=F){
seqFile.frm <- fread(seqFile)
names(seqFile.frm) <- c('GeneId','Chr', 'Start', 'End', 'Strand', 'Length', 'Count')
# Reformat for ordering
seqFile.frm$Chr <-
as.character(factor(seqFile.frm$Chr,
levels=paste0('chr', c(1:22,'X','Y','M')),
labels=paste0('chr', c(formatC(1:22,width=2,flag='0'),'X','Y','M'))))
rownames(seqFile.frm) <-
with(seqFile.frm,
paste(Chr, Strand, Start, End, GeneId, sep='~'))
seqFile.frm <- seqFile.frm[,-(1:6),drop=F]
if(Verbose) print(seqFile.frm[1:6,,drop=F])
if(Verbose) print(dim(seqFile.frm))
seqFile.frm}
# kelly's colors - https://i.kinja-img.com/gawker-media/image/upload/1015680494325093012.JPG
# https://gist.github.com/ollieglass/f6ddd781eeae1d24e391265432297538
#KellyColors.vec <- see web site
# REMOVED '#F2F3F4' in first entry
col_vec <- c('#222222', '#F3C300', '#875692', '#F38400', '#A1CAF1',
'#BE0032', '#C2B280', '#848482', '#008856', '#E68FAC', '#0067A5',
'#F99379', '#604E97', '#F6A600', '#B3446C', '#DCD300', '#882D17',
'#8DB600', '#654522', '#E25822', '#2B3D26')
suppressPackageStartupMessages(require(GEOquery))
GSEMatrix_obj <- getGEO(GSE_ID, GSEMatrix=T)
## Found 1 file(s)
## GSE112679_series_matrix.txt.gz
## Parsed with column specification:
## cols(
## .default = col_character()
## )
## See spec(...) for full column specifications.
## File stored at:
## /var/folders/wd/v5lfllh152x0hwhtr773cnmnjqcpdt/T//RtmpPWAJZr/GPL18573.soft
show(GSEMatrix_obj)
## $GSE112679_series_matrix.txt.gz
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 0 features, 2606 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: GSM3076193 GSM3076194 ... GSM3905974 (2606 total)
## varLabels: title geo_accession ... training/validation group:ch1 (46
## total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 31358576
## Annotation: GPL18573
DT::datatable(pData(phenoData(GSEMatrix_obj[[1]])))
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
KEY_FIELDS <- grep(':ch1$', names(pData(phenoData(GSEMatrix_obj[[1]]))), value=T)
sampType_vec <- sapply(
strsplit(as.character(pData(phenoData(GSEMatrix_obj[[1]]))[,"title"]),
split='_'), function(X) X[1])
sampNo_vec <- sapply(
strsplit(as.character(pData(phenoData(GSEMatrix_obj[[1]]))[,"title"]),
split='_'), function(X) X[2])
bioSamples_vec <- sapply(
strsplit(as.character(pData(phenoData(GSEMatrix_obj[[1]]))[,"relation"]),
split='/'), function(X) rev(X)[1])
SRA_vec <- sapply(
strsplit(as.character(pData(phenoData(GSEMatrix_obj[[1]]))[,"relation.1"]),
split='='), function(X) rev(X)[1])
fileName_vec <- sapply(
strsplit(as.character(pData(phenoData(GSEMatrix_obj[[1]]))[,"supplementary_file_1"]),
split='/'), function(X) rev(X)[1])
sampDesc_frm <- data.frame(
geoAcc=pData(phenoData(GSEMatrix_obj[[1]]))[,"geo_accession"],
title=pData(phenoData(GSEMatrix_obj[[1]]))[,"title"],
sampType=sampType_vec,
sampNo=sampNo_vec,
bioSample=bioSamples_vec,
SRA=SRA_vec,
fileName=fileName_vec,
pData(phenoData(GSEMatrix_obj[[1]]))[, KEY_FIELDS])
names(sampDesc_frm) <-
sub('bclc.stage.ch1', 'stage',
sub('diagnosis.ch1', 'Dx',
sub('tissue.subtype.ch1', 'tissueSubtype',
sub('tissue.ch1', 'tissue',
sub('training.validation.group.ch1', 'trainValGroup',
names(sampDesc_frm))))))
# Make sure fileName and geoAcc match
fileNamegeoAcc.vec <- sapply(
strsplit(as.character(sampDesc_frm$fileName), split='_'), '[',1)
if(sum(fileNamegeoAcc.vec!= sampDesc_frm$geoAcc))
stop("geoAcc/fileName Mismatch")
str(sampDesc_frm)
## 'data.frame': 2606 obs. of 12 variables:
## $ geoAcc : chr "GSM3076193" "GSM3076194" "GSM3076195" "GSM3076196" ...
## $ title : chr "blood_1" "blood_2" "blood_3" "blood_4" ...
## $ sampType : chr "blood" "blood" "blood" "blood" ...
## $ sampNo : chr "1" "2" "3" "4" ...
## $ bioSample : chr "SAMN08869478" "SAMN08869477" "SAMN08869476" "SAMN08869475" ...
## $ SRA : chr "SRX3890002" "SRX3890003" "SRX3890004" "SRX3890005" ...
## $ fileName : chr "GSM3076193_Seq_1.genebody.txt.gz" "GSM3076194_Seq_2.genebody.txt.gz" "GSM3076195_Seq_3.genebody.txt.gz" "GSM3076196_Seq_4.genebody.txt.gz" ...
## $ stage : chr NA NA NA NA ...
## $ Dx : chr "CHB" "Liver cirrhosis" "Liver cirrhosis" "Healthy" ...
## $ tissueSubtype: chr NA NA NA NA ...
## $ tissue : chr "plasma" "plasma" "plasma" "plasma" ...
## $ trainValGroup: chr "Training" "Training" "Training" "Training" ...
cat("geoAcc is unique - use as rownames:\n")
## geoAcc is unique - use as rownames:
with(sampDesc_frm, table(table(geoAcc, exclude=NULL)))
##
## 1
## 2606
rownames(sampDesc_frm) <- sampDesc_frm$geoAcc
cat("SRA is unique:\n")
## SRA is unique:
with(sampDesc_frm, table(table(SRA, exclude=NULL)))
##
## 1
## 2606
cat("bioSample is unique:\n")
## bioSample is unique:
with(sampDesc_frm, table(table(bioSample, exclude=NULL)))
##
## 1
## 2606
cat("title is unique:\n")
## title is unique:
with(sampDesc_frm, table(table(title, exclude=NULL)))
##
## 1
## 2606
cat("Some Samples Match bu sampNo:")
## Some Samples Match bu sampNo:
with(sampDesc_frm, table(table(sampNo, exclude=NULL)))
##
## 1 2 3
## 2531 3 23
# NOTE: examination of the data indicate that sampNo cannot be used
# to match Blood with TU or TI samples
DEPRICATED <- function() {
NoSamp.tbl <- with(sampDesc_frm, table(sampNo, exclude=NULL))
sampDesc_frm <- merge(
data.frame(sampNo=names(nSamp.tbl), nSamp=as.vector(nSamp.tbl)),
sampDesc_frm, by='sampNo', all.y=T)
}#DEPRICATED
sampDesc_frm <- sampDesc_frm[with(sampDesc_frm,
order(as.numeric(sampNo), title)),]
# Shorten trainValGroup
sampDesc_frm$trainValGroup <-
sub('Training', 'Train',
sub('Validation', 'Val',
sampDesc_frm$trainValGroup))
# Shorten Dx
sampDesc_frm$Dx <-
sub('Benign liver lesions', 'Benign',
sub('Liver cirrhosis', 'Cirrhosis',
sampDesc_frm$Dx))
trainValGroupDX.tbl <-
with(sampDesc_frm,
table(Dx_tissue=paste0(Dx,'_',tissue), trainValGroup, exclude=NULL))
trainValGroupDX.tbl
## trainValGroup
## Dx_tissue Train Val-1 Val-2 <NA>
## Benign_plasma 253 132 3 0
## CHB_plasma 190 96 0 0
## Cirrhosis_plasma 73 33 0 0
## HCC_liver 0 0 0 52
## HCC_plasma 335 809 60 0
## Healthy_plasma 269 124 177 0
barplot(trainValGroupDX.tbl, beside=T, legend=T,
col=col_vec[1:nrow(trainValGroupDX.tbl)])
title("Sample Counts by Tissue Type")
# For consistency with previous code, we will use Outcome as an alias to Dx,
# and sampID as an alias to geoAcc
sampDesc_frm$outcome <- sampDesc_frm$Dx
sampDesc_frm$sampID <- sampDesc_frm$geoAcc
cat("Cai et al. combine Beign+Healthy and Cirrhosis+HCC\n")
## Cai et al. combine Beign+Healthy and Cirrhosis+HCC
cat("Create secondary outcome\n")
## Create secondary outcome
sampDesc_frm$outcome2 <- with(sampDesc_frm,
ifelse(Dx %in% c("Benign", "Healthy"), 'BenignHealthy',
ifelse(Dx %in% c("Cirrhosis", "CHB"), 'CirrhosisCHB', Dx)))
with(sampDesc_frm, table(trainValGroup, outcome2, exclude=NULL))
## outcome2
## trainValGroup BenignHealthy CirrhosisCHB HCC
## Train 522 263 335
## Val-1 256 129 809
## Val-2 180 0 60
## <NA> 0 0 52
cat("Also want outcome3 == HCC or nonHCC\n")
## Also want outcome3 == HCC or nonHCC
sampDesc_frm$outcome3 <- with(sampDesc_frm,
ifelse(Dx == 'HCC', 'HCC', 'nonHCC'))
with(sampDesc_frm, table(trainValGroup, outcome3, exclude=NULL))
## outcome3
## trainValGroup HCC nonHCC
## Train 335 785
## Val-1 809 385
## Val-2 60 180
## <NA> 52 0
usethis::use_data(sampDesc_frm, overwrite=T)
## ✔ Setting active project to '/Users/fcollin/Documents/Projects/Rstuff/devPackages/GSE112679'
## ✔ Saving 'sampDesc_frm' to 'data/sampDesc_frm.rda'
## ● Document your data (see 'https://r-pkgs.org/data.html')
DT::datatable(sampDesc_frm, options=list(pageLength = 18))
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
1. Cai, J., Chen, L., Zhang, Z., Zhang, X., Lu, X., Liu, W., Shi, G., Ge, Y., Gao, P., and Yang, Y. et al. Genome-wide mapping of 5-hydroxymethylcytosines in circulating cell-free dna as a non-invasive approach for early detection of hepatocellular carcinoma. Gut, gutjnl–2019–318882. Available at: http://gut.bmj.com/content/early/2019/07/28/gutjnl-2019-318882.abstract.